1 Import Libraries

library(rmarkdown)
library(dplyr)
library(ggplot2)
library(broom)
library(janitor)
library(renv)
library(purrr)
library(tm)
library(SnowballC)
library(RColorBrewer)
library(ggplot2)
library(wordcloud)
library(biclust)
library(cluster)
library(igraph)
library(fpc)
library(magrittr)
library(rmarkdown)
library(textreuse)
library(slam)


library(htmltools)
library(plotly)
library(klaR)
library(tidyr)
#packages <- c("tm", "SnowballC", "RColorBrewer", "ggplot2", "wordcloud", "biclust", "cluster", "igraph", "fpc", "knitr", "dplyr", "broom", "janitor", "renv", "purrr")
#install.packages(packages, dependencies = TRUE)

2 Introduction

2.1 Describe Dataset

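The dataset used for this project consists of the inaugural addresses of U.S. presidents, obtained from this link (https://www.presidency.ucsb.edu/documents/presidential-documents-archive-guidebook/inaugural-addresses).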

Using the following script in Python, we first created a dataframe of the website’s speeches:

import requests
from bs4 import BeautifulSoup
import pandas as pd

# Scrapes transcripts for inaugural addresses


def get_urls(url):
    """Collect the transcript links listed on an index page.

    Downloads ``url``, parses the HTML with the ``lxml`` parser, locates the
    first ``<table>`` whose class is ``table`` and returns the ``href`` of
    every ``<a>`` tag found inside it, in document order.
    """
    html = requests.get(url).text
    document = BeautifulSoup(html, 'lxml')
    listing = document.find("table", class_='table')

    links = []
    for anchor in listing.find_all("a"):
        links.append(anchor["href"])
    return links

# Gather the transcript links from the inaugural-addresses index page.
urls = get_urls(
    url="https://www.presidency.ucsb.edu/documents/presidential-documents-archive-guidebook/inaugural-addresses"
)

# Empty frame that is handed to get_transcripts() as the starting table.
transcripts = pd.DataFrame()

def get_transcripts(urls, transcripts):
    """Download each transcript page and add one row per page to a table.

    Parameters
    ----------
    urls : iterable of str
        Addresses of the transcript pages to download.
    transcripts : pandas.DataFrame
        Existing table; the scraped rows are placed after its rows.

    Returns
    -------
    pandas.DataFrame
        ``transcripts`` followed by one row per URL with the columns
        ``president``, ``year`` and ``content``. When ``urls`` is empty the
        ``transcripts`` object is returned unchanged.
    """
    # Rows are collected in a list and concatenated once: DataFrame.append
    # was removed in pandas 2.0, and appending inside the loop copied the
    # whole frame on every iteration.
    records = []
    for u in urls:
        page = requests.get(u).text
        soup = BeautifulSoup(page, 'lxml')
        t_president = soup.find("h3", class_="diet-title").text
        # Keep the text after the first comma of the displayed date
        # (presumably the year, e.g. "April 30, 1789" -> "1789").
        t_year = soup.find("span", class_="date-display-single").text.split(',')[1].strip()
        t_content = soup.find("div", class_="field-docs-content").text
        records.append({
            'president': t_president,
            'year': t_year,
            'content': t_content,
        })

    if not records:
        return transcripts
    return pd.concat([transcripts, pd.DataFrame(records)], ignore_index=True)

# Scrape every transcript, then save the table as a pipe-separated file.
data = get_transcripts(urls=urls, transcripts=transcripts)
data.to_csv(path_or_buf="us_presidents_transcripts.csv", sep="|")

In what follows, we load the dataframe:

# Read the speeches table from the project's GitHub repository, then print a
# compact column-by-column overview of it.
df <- read.csv(
  file = "https://raw.githubusercontent.com/berserkhmdvhb/MADS-NLP/main/data/presidents-speech.csv"
)
dplyr::glimpse(df)
## Rows: 59
## Columns: 4
## $ X         <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17…
## $ president <chr> "George Washington", "George Washington", "John Adams", "Tho…
## $ year      <int> 1789, 1793, 1797, 1801, 1805, 1809, 1813, 1817, 1821, 1825, …
## $ content   <chr> "\nFellow-Citizens of the Senate and of the House of Represe…
# Per-column summary statistics of the speeches table.
summary(df)
##        X         president              year        content         
##  Min.   : 0.0   Length:59          Min.   :1789   Length:59         
##  1st Qu.:14.5   Class :character   1st Qu.:1847   Class :character  
##  Median :29.0   Mode  :character   Median :1905   Mode  :character  
##  Mean   :29.0                      Mean   :1905                     
##  3rd Qu.:43.5                      3rd Qu.:1963                     
##  Max.   :58.0                      Max.   :2021

In what follows, a text file is generated from each row of the data frame and stored in the “texts” folder:

#presidents <- df[["president"]]|> unique() |>as.list()

# Directory that holds one plain-text file per speech; the corpus below is
# built from its contents.
texts_dir <- "./data/texts/"
dir.create(texts_dir, recursive = TRUE, showWarnings = FALSE)

# seq_len() iterates zero times for an empty data frame, whereas
# 1:nrow(df) would visit the non-existent rows 1 and 0.
for (i in seq_len(nrow(df))) {
  df_i <- df[i, ]
  name <- df_i$president
  year <- df_i$year
  text <- df_i$content
  # File name pattern: "<year>-<president>.txt"
  file_name <- paste0(year, "-", name, ".txt")
  loc <- paste0(texts_dir, file_name)
  # Write a speech only when its file is missing: a fresh checkout gets the
  # files the corpus needs, and files that already exist are never touched.
  if (!file.exists(loc)) {
    writeLines(text, loc)
  }
}

# Build a volatile (in-memory) corpus with one document per file.
loc <- texts_dir
docs <- tm::VCorpus(tm::DirSource(loc))
summary(docs)
##                                 Length Class             Mode
## 1789-George Washington.txt      2      PlainTextDocument list
## 1793-George Washington.txt      2      PlainTextDocument list
## 1797-John Adams.txt             2      PlainTextDocument list
## 1801-Thomas Jefferson.txt       2      PlainTextDocument list
## 1805-Thomas Jefferson.txt       2      PlainTextDocument list
## 1809-James Madison.txt          2      PlainTextDocument list
## 1813-James Madison.txt          2      PlainTextDocument list
## 1817-James Monroe.txt           2      PlainTextDocument list
## 1821-James Monroe.txt           2      PlainTextDocument list
## 1825-John Quincy Adams.txt      2      PlainTextDocument list
## 1829-Andrew Jackson.txt         2      PlainTextDocument list
## 1833-Andrew Jackson.txt         2      PlainTextDocument list
## 1837-Martin van Buren.txt       2      PlainTextDocument list
## 1841-William Henry Harrison.txt 2      PlainTextDocument list
## 1845-James K. Polk.txt          2      PlainTextDocument list
## 1849-Zachary Taylor.txt         2      PlainTextDocument list
## 1853-Franklin Pierce.txt        2      PlainTextDocument list
## 1857-James Buchanan.txt         2      PlainTextDocument list
## 1861-Abraham Lincoln.txt        2      PlainTextDocument list
## 1865-Abraham Lincoln.txt        2      PlainTextDocument list
## 1869-Ulysses S. Grant.txt       2      PlainTextDocument list
## 1873-Ulysses S. Grant.txt       2      PlainTextDocument list
## 1877-Rutherford B. Hayes.txt    2      PlainTextDocument list
## 1881-James A. Garfield.txt      2      PlainTextDocument list
## 1885-Grover Cleveland.txt       2      PlainTextDocument list
## 1889-Benjamin Harrison.txt      2      PlainTextDocument list
## 1893-Grover Cleveland.txt       2      PlainTextDocument list
## 1897-William McKinley.txt       2      PlainTextDocument list
## 1901-William McKinley.txt       2      PlainTextDocument list
## 1905-Theodore Roosevelt.txt     2      PlainTextDocument list
## 1909-William Howard Taft.txt    2      PlainTextDocument list
## 1913-Woodrow Wilson.txt         2      PlainTextDocument list
## 1917-Woodrow Wilson.txt         2      PlainTextDocument list
## 1921-Warren G. Harding.txt      2      PlainTextDocument list
## 1925-Calvin Coolidge.txt        2      PlainTextDocument list
## 1929-Herbert Hoover.txt         2      PlainTextDocument list
## 1933-Franklin D. Roosevelt.txt  2      PlainTextDocument list
## 1937-Franklin D. Roosevelt.txt  2      PlainTextDocument list
## 1941-Franklin D. Roosevelt.txt  2      PlainTextDocument list
## 1945-Franklin D. Roosevelt.txt  2      PlainTextDocument list
## 1949-Harry S. Truman.txt        2      PlainTextDocument list
## 1953-Dwight D. Eisenhower.txt   2      PlainTextDocument list
## 1957-Dwight D. Eisenhower.txt   2      PlainTextDocument list
## 1961-John F. Kennedy.txt        2      PlainTextDocument list
## 1965-Lyndon B. Johnson.txt      2      PlainTextDocument list
## 1969-Richard Nixon.txt          2      PlainTextDocument list
## 1973-Richard Nixon.txt          2      PlainTextDocument list
## 1977-Jimmy Carter.txt           2      PlainTextDocument list
## 1981-Ronald Reagan.txt          2      PlainTextDocument list
## 1985-Ronald Reagan.txt          2      PlainTextDocument list
## 1989-George Bush.txt            2      PlainTextDocument list
## 1993-William J. Clinton.txt     2      PlainTextDocument list
## 1997-William J. Clinton.txt     2      PlainTextDocument list
## 2001-George W. Bush.txt         2      PlainTextDocument list
## 2005-George W. Bush.txt         2      PlainTextDocument list
## 2009-Barack Obama.txt           2      PlainTextDocument list
## 2013-Barack Obama.txt           2      PlainTextDocument list
## 2017-Donald J. Trump.txt        2      PlainTextDocument list
## 2021-Joseph R. Biden.txt        2      PlainTextDocument list
# Show an overview of the one-document sub-corpus holding the first speech.
tm::inspect(docs[1])
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 1
## 
## [[1]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 8617
# Print the text of the first speech. `[[` extracts the document itself, so
# as.character() returns its lines; on the one-element corpus `docs[1]` it
# would deparse the whole list (content and metadata) instead.
writeLines(as.character(docs[[1]]))
## list(list(content = c("", "Fellow-Citizens of the Senate and of the House of Representatives:", "Among the vicissitudes incident to life no event could have filled me with greater anxieties than that of which the notification was transmitted by your order, and received on the 14th day of the present month. On the one hand, I was summoned by my country, whose voice I can never hear but with veneration and love, from a retreat which I had chosen with the fondest predilection, and, in my flattering hopes, with an immutable decision, as the asylum of my declining years—a retreat which was rendered every day more necessary as well as more dear to me by the addition of habit to inclination, and of frequent interruptions in my health to the gradual waste committed on it by time. On the other hand, the magnitude and difficulty of the trust to which the voice of my country called me, being sufficient to awaken in the wisest and most experienced of her citizens a distrustful scrutiny into his qualifications, could not but overwhelm with despondence one who (inheriting inferior endowments from nature and unpracticed in the duties of civil administration) ought to be peculiarly conscious of his own deficiencies. In this conflict of emotions all I dare aver is that it has been my faithful study to collect my duty from a just appreciation of every circumstance by which it might be affected. All I dare hope is that if, in executing this task, I have been too much swayed by a grateful remembrance of former instances, or by an affectionate sensibility to this transcendent proof of the confidence of my fellow-citizens, and have thence too little consulted my incapacity as well as disinclination for the weighty and untried cares before me, my error will be palliated by the motives which mislead [see APP note] me, and its consequences be judged by my country with some share of the partiality in which they originated.", 
## "Such being the impressions under which I have, in obedience to the public summons, repaired to the present station, it would be peculiarly improper to omit in this first official act my fervent supplications to that Almighty Being who rules over the universe, who presides in the councils of nations, and whose providential aids can supply every human defect, that His benediction may consecrate to the liberties and happiness of the people of the United States a Government instituted by themselves for these essential purposes, and may enable every instrument employed in its administration to execute with success the functions allotted to his charge. In tendering this homage to the Great Author of every public and private good, I assure myself that it expresses your sentiments not less than my own, nor those of my fellow-citizens at large less than either. No people can be bound to acknowledge and adore the Invisible Hand which conducts the affairs of men more than those of the United States. Every step by which they have advanced to the character of an independent nation seems to have been distinguished by some token of providential agency; and in the important revolution just accomplished in the system of their united government the tranquil deliberations and voluntary consent of so many distinct communities from which the event has resulted can not be compared with the means by which most governments have been established without some return of pious gratitude, along with an humble anticipation of the future blessings which the past seem to presage. These reflections, arising out of the present crisis, have forced themselves too strongly on my mind to be suppressed. You will join with me, I trust, in thinking that there are none under the influence of which the proceedings of a new and free government can more auspiciously commence.", 
## "By the article establishing the executive department it is made the duty of the President \"to recommend to your consideration such measures as he shall judge necessary and expedient.\" The circumstances under which I now meet you will acquit me from entering into that subject further than to refer to the great constitutional charter under which you are assembled, and which, in defining your powers, designates the objects to which your attention is to be given. It will be more consistent with those circumstances, and far more congenial with the feelings which actuate me, to substitute, in place of a recommendation of particular measures, the tribute that is due to the talents, the rectitude, and the patriotism which adorn the characters selected to devise and adopt them. In these honorable qualifications I behold the surest pledges that as on one side no local prejudices or attachments, no separate views nor party animosities, will misdirect the comprehensive and equal eye which ought to watch over this great assemblage of communities and interests, so, on another, that the foundation of our national policy will be laid in the pure and immutable principles of private morality, and the preeminence of free government be exemplified by all the attributes which can win the affections of its citizens and command the respect of the world. 
I dwell on this prospect with every satisfaction which an ardent love for my country can inspire, since there is no truth more thoroughly established than that there exists in the economy and course of nature an indissoluble union between virtue and happiness; between duty and advantage; between the genuine maxims of an honest and magnanimous policy and the solid rewards of public prosperity and felicity; since we ought to be no less persuaded that the propitious smiles of Heaven can never be expected on a nation that disregards the eternal rules of order and right which Heaven itself has ordained; and since the preservation of the sacred fire of liberty and the destiny of the republican model of government are justly considered, perhaps, as deeply, as finally, staked on the experiment entrusted to the hands of the American people.", 
## "Besides the ordinary objects submitted to your care, it will remain with your judgment to decide how far an exercise of the occasional power delegated by the fifth article of the Constitution is rendered expedient at the present juncture by the nature of objections which have been urged against the system, or by the degree of inquietude which has given birth to them. Instead of undertaking particular recommendations on this subject, in which I could be guided by no lights derived from official opportunities, I shall again give way to my entire confidence in your discernment and pursuit of the public good; for I assure myself that whilst you carefully avoid every alteration which might endanger the benefits of an united and effective government, or which ought to await the future lessons of experience, a reverence for the characteristic rights of freemen and a regard for the public harmony will sufficiently influence your deliberations on the question how far the former can be impregnably fortified or the latter be safely and advantageously promoted.", 
## "To the foregoing observations I have one to add, which will be most properly addressed to the House of Representatives. It concerns myself, and will therefore be as brief as possible. When I was first honored with a call into the service of my country, then on the eve of an arduous struggle for its liberties, the light in which I contemplated my duty required that I should renounce every pecuniary compensation. From this resolution I have in no instance departed; and being still under the impressions which produced it, I must decline as inapplicable to myself any share in the personal emoluments which may be indispensably included in a permanent provision for the executive department, and must accordingly pray that the pecuniary estimates for the station in which I am placed may during my continuance in it be limited to such actual expenditures as the public good may be thought to require.", 
## "Having thus imparted to you my sentiments as they have been awakened by the occasion which brings us together, I shall take my present leave; but not without resorting once more to the benign Parent of the Human Race in humble supplication that, since He has been pleased to favor the American people with opportunities for deliberating in perfect tranquillity, and dispositions for deciding with unparalleled unanimity on a form of government for the security of their union and the advancement of their happiness, so His divine blessing may be equally conspicuous in the enlarged views, the temperate consultations, and the wise measures on which the success of this Government must depend.", 
## ""), meta = list(author = character(0), datetimestamp = list(sec = 35.7147550582886, min = 24, hour = 11, mday = 3, mon = 0, year = 123, wday = 2, yday = 2, isdst = 0), description = character(0), heading = character(0), id = "1789-George Washington.txt", language = "en", origin = character(0))))
## list()
## list()

2.2 Goal and Procedure

This project investigates the text similarity between speeches given by different US presidents over the years, from 1789 to 2021.

In Preprocessing section, numerous text mining tasks are implemented on all the docs.

In the Word Frequency section, the frequency of different terms in the documents is analyzed and visualized.

In Doc Similarity, similarity between documents is measured, analyzed, and visualized.

In Conclusion, main findings are summarized.

The GitHub repository for this project can be found at this link (https://github.com/berserkhmdvhb/MADS-NLP).

3 Preprocessing

The tm package is a framework for text mining applications within R. Most functions used henceforth stem from this package.

3.1 Remove punctuation

# Strip punctuation from every document in the corpus.
docs <- tm::tm_map(docs, tm::removePunctuation)
# Print the first speech. `[[` extracts the document itself, so as.character()
# returns its text lines rather than a deparsed one-element corpus.
writeLines(as.character(docs[[1]]))
## list(list(content = c("", "FellowCitizens of the Senate and of the House of Representatives", "Among the vicissitudes incident to life no event could have filled me with greater anxieties than that of which the notification was transmitted by your order and received on the 14th day of the present month On the one hand I was summoned by my country whose voice I can never hear but with veneration and love from a retreat which I had chosen with the fondest predilection and in my flattering hopes with an immutable decision as the asylum of my declining years—a retreat which was rendered every day more necessary as well as more dear to me by the addition of habit to inclination and of frequent interruptions in my health to the gradual waste committed on it by time On the other hand the magnitude and difficulty of the trust to which the voice of my country called me being sufficient to awaken in the wisest and most experienced of her citizens a distrustful scrutiny into his qualifications could not but overwhelm with despondence one who inheriting inferior endowments from nature and unpracticed in the duties of civil administration ought to be peculiarly conscious of his own deficiencies In this conflict of emotions all I dare aver is that it has been my faithful study to collect my duty from a just appreciation of every circumstance by which it might be affected All I dare hope is that if in executing this task I have been too much swayed by a grateful remembrance of former instances or by an affectionate sensibility to this transcendent proof of the confidence of my fellowcitizens and have thence too little consulted my incapacity as well as disinclination for the weighty and untried cares before me my error will be palliated by the motives which mislead see APP note me and its consequences be judged by my country with some share of the partiality in which they originated", 
## "Such being the impressions under which I have in obedience to the public summons repaired to the present station it would be peculiarly improper to omit in this first official act my fervent supplications to that Almighty Being who rules over the universe who presides in the councils of nations and whose providential aids can supply every human defect that His benediction may consecrate to the liberties and happiness of the people of the United States a Government instituted by themselves for these essential purposes and may enable every instrument employed in its administration to execute with success the functions allotted to his charge In tendering this homage to the Great Author of every public and private good I assure myself that it expresses your sentiments not less than my own nor those of my fellowcitizens at large less than either No people can be bound to acknowledge and adore the Invisible Hand which conducts the affairs of men more than those of the United States Every step by which they have advanced to the character of an independent nation seems to have been distinguished by some token of providential agency and in the important revolution just accomplished in the system of their united government the tranquil deliberations and voluntary consent of so many distinct communities from which the event has resulted can not be compared with the means by which most governments have been established without some return of pious gratitude along with an humble anticipation of the future blessings which the past seem to presage These reflections arising out of the present crisis have forced themselves too strongly on my mind to be suppressed You will join with me I trust in thinking that there are none under the influence of which the proceedings of a new and free government can more auspiciously commence", 
## "By the article establishing the executive department it is made the duty of the President to recommend to your consideration such measures as he shall judge necessary and expedient The circumstances under which I now meet you will acquit me from entering into that subject further than to refer to the great constitutional charter under which you are assembled and which in defining your powers designates the objects to which your attention is to be given It will be more consistent with those circumstances and far more congenial with the feelings which actuate me to substitute in place of a recommendation of particular measures the tribute that is due to the talents the rectitude and the patriotism which adorn the characters selected to devise and adopt them In these honorable qualifications I behold the surest pledges that as on one side no local prejudices or attachments no separate views nor party animosities will misdirect the comprehensive and equal eye which ought to watch over this great assemblage of communities and interests so on another that the foundation of our national policy will be laid in the pure and immutable principles of private morality and the preeminence of free government be exemplified by all the attributes which can win the affections of its citizens and command the respect of the world I dwell on this prospect with every satisfaction which an ardent love for my country can inspire since there is no truth more thoroughly established than that there exists in the economy and course of nature an indissoluble union between virtue and happiness between duty and advantage between the genuine maxims of an honest and magnanimous policy and the solid rewards of public prosperity and felicity since we ought to be no less persuaded that the propitious smiles of Heaven can never be expected on a nation that disregards the eternal rules of order and right which Heaven itself has ordained and since the preservation of the sacred fire of liberty and 
the destiny of the republican model of government are justly considered perhaps as deeply as finally staked on the experiment entrusted to the hands of the American people", 
## "Besides the ordinary objects submitted to your care it will remain with your judgment to decide how far an exercise of the occasional power delegated by the fifth article of the Constitution is rendered expedient at the present juncture by the nature of objections which have been urged against the system or by the degree of inquietude which has given birth to them Instead of undertaking particular recommendations on this subject in which I could be guided by no lights derived from official opportunities I shall again give way to my entire confidence in your discernment and pursuit of the public good for I assure myself that whilst you carefully avoid every alteration which might endanger the benefits of an united and effective government or which ought to await the future lessons of experience a reverence for the characteristic rights of freemen and a regard for the public harmony will sufficiently influence your deliberations on the question how far the former can be impregnably fortified or the latter be safely and advantageously promoted", 
## "To the foregoing observations I have one to add which will be most properly addressed to the House of Representatives It concerns myself and will therefore be as brief as possible When I was first honored with a call into the service of my country then on the eve of an arduous struggle for its liberties the light in which I contemplated my duty required that I should renounce every pecuniary compensation From this resolution I have in no instance departed and being still under the impressions which produced it I must decline as inapplicable to myself any share in the personal emoluments which may be indispensably included in a permanent provision for the executive department and must accordingly pray that the pecuniary estimates for the station in which I am placed may during my continuance in it be limited to such actual expenditures as the public good may be thought to require", 
## "Having thus imparted to you my sentiments as they have been awakened by the occasion which brings us together I shall take my present leave but not without resorting once more to the benign Parent of the Human Race in humble supplication that since He has been pleased to favor the American people with opportunities for deliberating in perfect tranquillity and dispositions for deciding with unparalleled unanimity on a form of government for the security of their union and the advancement of their happiness so His divine blessing may be equally conspicuous in the enlarged views the temperate consultations and the wise measures on which the success of this Government must depend", 
## ""), meta = list(author = character(0), datetimestamp = list(sec = 35.7147550582886, min = 24, hour = 11, mday = 3, mon = 0, year = 123, wday = 2, yday = 2, isdst = 0), description = character(0), heading = character(0), id = "1789-George Washington.txt", language = "en", origin = character(0))))
## list()
## list()

3.2 Remove special characters

# Replace every match of `pattern` with a space. content_transformer() applies
# the function to a document's content only, so each element stays a tm
# document with its metadata; assigning gsub()'s result to docs[[j]] directly
# would replace the document with a bare character vector.
to_space <- tm::content_transformer(function(x, pattern) gsub(pattern, " ", x))

# "\u2028" is the Unicode LINE SEPARATOR character, which did not translate.
# NOTE(review): punctuation was already removed in the previous step, so the
# first three patterns probably no longer match anything here -- confirm, and
# consider running this step before punctuation removal.
for (pattern in c("/", "@", "\\|", "\u2028")) {
  docs <- tm::tm_map(docs, to_space, pattern)
}

# Print the first speech (`[[` extracts the document, not a sub-corpus).
writeLines(as.character(docs[[1]]))
## list(c("", "FellowCitizens of the Senate and of the House of Representatives", "Among the vicissitudes incident to life no event could have filled me with greater anxieties than that of which the notification was transmitted by your order and received on the 14th day of the present month On the one hand I was summoned by my country whose voice I can never hear but with veneration and love from a retreat which I had chosen with the fondest predilection and in my flattering hopes with an immutable decision as the asylum of my declining years—a retreat which was rendered every day more necessary as well as more dear to me by the addition of habit to inclination and of frequent interruptions in my health to the gradual waste committed on it by time On the other hand the magnitude and difficulty of the trust to which the voice of my country called me being sufficient to awaken in the wisest and most experienced of her citizens a distrustful scrutiny into his qualifications could not but overwhelm with despondence one who inheriting inferior endowments from nature and unpracticed in the duties of civil administration ought to be peculiarly conscious of his own deficiencies In this conflict of emotions all I dare aver is that it has been my faithful study to collect my duty from a just appreciation of every circumstance by which it might be affected All I dare hope is that if in executing this task I have been too much swayed by a grateful remembrance of former instances or by an affectionate sensibility to this transcendent proof of the confidence of my fellowcitizens and have thence too little consulted my incapacity as well as disinclination for the weighty and untried cares before me my error will be palliated by the motives which mislead see APP note me and its consequences be judged by my country with some share of the partiality in which they originated", 
## "Such being the impressions under which I have in obedience to the public summons repaired to the present station it would be peculiarly improper to omit in this first official act my fervent supplications to that Almighty Being who rules over the universe who presides in the councils of nations and whose providential aids can supply every human defect that His benediction may consecrate to the liberties and happiness of the people of the United States a Government instituted by themselves for these essential purposes and may enable every instrument employed in its administration to execute with success the functions allotted to his charge In tendering this homage to the Great Author of every public and private good I assure myself that it expresses your sentiments not less than my own nor those of my fellowcitizens at large less than either No people can be bound to acknowledge and adore the Invisible Hand which conducts the affairs of men more than those of the United States Every step by which they have advanced to the character of an independent nation seems to have been distinguished by some token of providential agency and in the important revolution just accomplished in the system of their united government the tranquil deliberations and voluntary consent of so many distinct communities from which the event has resulted can not be compared with the means by which most governments have been established without some return of pious gratitude along with an humble anticipation of the future blessings which the past seem to presage These reflections arising out of the present crisis have forced themselves too strongly on my mind to be suppressed You will join with me I trust in thinking that there are none under the influence of which the proceedings of a new and free government can more auspiciously commence", 
## "By the article establishing the executive department it is made the duty of the President to recommend to your consideration such measures as he shall judge necessary and expedient The circumstances under which I now meet you will acquit me from entering into that subject further than to refer to the great constitutional charter under which you are assembled and which in defining your powers designates the objects to which your attention is to be given It will be more consistent with those circumstances and far more congenial with the feelings which actuate me to substitute in place of a recommendation of particular measures the tribute that is due to the talents the rectitude and the patriotism which adorn the characters selected to devise and adopt them In these honorable qualifications I behold the surest pledges that as on one side no local prejudices or attachments no separate views nor party animosities will misdirect the comprehensive and equal eye which ought to watch over this great assemblage of communities and interests so on another that the foundation of our national policy will be laid in the pure and immutable principles of private morality and the preeminence of free government be exemplified by all the attributes which can win the affections of its citizens and command the respect of the world I dwell on this prospect with every satisfaction which an ardent love for my country can inspire since there is no truth more thoroughly established than that there exists in the economy and course of nature an indissoluble union between virtue and happiness between duty and advantage between the genuine maxims of an honest and magnanimous policy and the solid rewards of public prosperity and felicity since we ought to be no less persuaded that the propitious smiles of Heaven can never be expected on a nation that disregards the eternal rules of order and right which Heaven itself has ordained and since the preservation of the sacred fire of liberty and 
the destiny of the republican model of government are justly considered perhaps as deeply as finally staked on the experiment entrusted to the hands of the American people", 
## "Besides the ordinary objects submitted to your care it will remain with your judgment to decide how far an exercise of the occasional power delegated by the fifth article of the Constitution is rendered expedient at the present juncture by the nature of objections which have been urged against the system or by the degree of inquietude which has given birth to them Instead of undertaking particular recommendations on this subject in which I could be guided by no lights derived from official opportunities I shall again give way to my entire confidence in your discernment and pursuit of the public good for I assure myself that whilst you carefully avoid every alteration which might endanger the benefits of an united and effective government or which ought to await the future lessons of experience a reverence for the characteristic rights of freemen and a regard for the public harmony will sufficiently influence your deliberations on the question how far the former can be impregnably fortified or the latter be safely and advantageously promoted", 
## "To the foregoing observations I have one to add which will be most properly addressed to the House of Representatives It concerns myself and will therefore be as brief as possible When I was first honored with a call into the service of my country then on the eve of an arduous struggle for its liberties the light in which I contemplated my duty required that I should renounce every pecuniary compensation From this resolution I have in no instance departed and being still under the impressions which produced it I must decline as inapplicable to myself any share in the personal emoluments which may be indispensably included in a permanent provision for the executive department and must accordingly pray that the pecuniary estimates for the station in which I am placed may during my continuance in it be limited to such actual expenditures as the public good may be thought to require", 
## "Having thus imparted to you my sentiments as they have been awakened by the occasion which brings us together I shall take my present leave but not without resorting once more to the benign Parent of the Human Race in humble supplication that since He has been pleased to favor the American people with opportunities for deliberating in perfect tranquillity and dispositions for deciding with unparalleled unanimity on a form of government for the security of their union and the advancement of their happiness so His divine blessing may be equally conspicuous in the enlarged views the temperate consultations and the wise measures on which the success of this Government must depend", 
## ""))
## list()
## list()

3.3 Remove numbers

# Strip every digit from each document. Ordinals lose their number, which is
# why the printed text below reads "on the th day".
docs <- tm::tm_map(docs, tm::removeNumbers)
# Single-bracket subsetting keeps the corpus container, so as.character()
# prints its deparsed parts (the list(...) lines shown below), not bare text.
docs[1] %>%
  as.character() %>%
  writeLines()
## list(c("", "FellowCitizens of the Senate and of the House of Representatives", "Among the vicissitudes incident to life no event could have filled me with greater anxieties than that of which the notification was transmitted by your order and received on the th day of the present month On the one hand I was summoned by my country whose voice I can never hear but with veneration and love from a retreat which I had chosen with the fondest predilection and in my flattering hopes with an immutable decision as the asylum of my declining years—a retreat which was rendered every day more necessary as well as more dear to me by the addition of habit to inclination and of frequent interruptions in my health to the gradual waste committed on it by time On the other hand the magnitude and difficulty of the trust to which the voice of my country called me being sufficient to awaken in the wisest and most experienced of her citizens a distrustful scrutiny into his qualifications could not but overwhelm with despondence one who inheriting inferior endowments from nature and unpracticed in the duties of civil administration ought to be peculiarly conscious of his own deficiencies In this conflict of emotions all I dare aver is that it has been my faithful study to collect my duty from a just appreciation of every circumstance by which it might be affected All I dare hope is that if in executing this task I have been too much swayed by a grateful remembrance of former instances or by an affectionate sensibility to this transcendent proof of the confidence of my fellowcitizens and have thence too little consulted my incapacity as well as disinclination for the weighty and untried cares before me my error will be palliated by the motives which mislead see APP note me and its consequences be judged by my country with some share of the partiality in which they originated", 
## "Such being the impressions under which I have in obedience to the public summons repaired to the present station it would be peculiarly improper to omit in this first official act my fervent supplications to that Almighty Being who rules over the universe who presides in the councils of nations and whose providential aids can supply every human defect that His benediction may consecrate to the liberties and happiness of the people of the United States a Government instituted by themselves for these essential purposes and may enable every instrument employed in its administration to execute with success the functions allotted to his charge In tendering this homage to the Great Author of every public and private good I assure myself that it expresses your sentiments not less than my own nor those of my fellowcitizens at large less than either No people can be bound to acknowledge and adore the Invisible Hand which conducts the affairs of men more than those of the United States Every step by which they have advanced to the character of an independent nation seems to have been distinguished by some token of providential agency and in the important revolution just accomplished in the system of their united government the tranquil deliberations and voluntary consent of so many distinct communities from which the event has resulted can not be compared with the means by which most governments have been established without some return of pious gratitude along with an humble anticipation of the future blessings which the past seem to presage These reflections arising out of the present crisis have forced themselves too strongly on my mind to be suppressed You will join with me I trust in thinking that there are none under the influence of which the proceedings of a new and free government can more auspiciously commence", 
## "By the article establishing the executive department it is made the duty of the President to recommend to your consideration such measures as he shall judge necessary and expedient The circumstances under which I now meet you will acquit me from entering into that subject further than to refer to the great constitutional charter under which you are assembled and which in defining your powers designates the objects to which your attention is to be given It will be more consistent with those circumstances and far more congenial with the feelings which actuate me to substitute in place of a recommendation of particular measures the tribute that is due to the talents the rectitude and the patriotism which adorn the characters selected to devise and adopt them In these honorable qualifications I behold the surest pledges that as on one side no local prejudices or attachments no separate views nor party animosities will misdirect the comprehensive and equal eye which ought to watch over this great assemblage of communities and interests so on another that the foundation of our national policy will be laid in the pure and immutable principles of private morality and the preeminence of free government be exemplified by all the attributes which can win the affections of its citizens and command the respect of the world I dwell on this prospect with every satisfaction which an ardent love for my country can inspire since there is no truth more thoroughly established than that there exists in the economy and course of nature an indissoluble union between virtue and happiness between duty and advantage between the genuine maxims of an honest and magnanimous policy and the solid rewards of public prosperity and felicity since we ought to be no less persuaded that the propitious smiles of Heaven can never be expected on a nation that disregards the eternal rules of order and right which Heaven itself has ordained and since the preservation of the sacred fire of liberty and 
the destiny of the republican model of government are justly considered perhaps as deeply as finally staked on the experiment entrusted to the hands of the American people", 
## "Besides the ordinary objects submitted to your care it will remain with your judgment to decide how far an exercise of the occasional power delegated by the fifth article of the Constitution is rendered expedient at the present juncture by the nature of objections which have been urged against the system or by the degree of inquietude which has given birth to them Instead of undertaking particular recommendations on this subject in which I could be guided by no lights derived from official opportunities I shall again give way to my entire confidence in your discernment and pursuit of the public good for I assure myself that whilst you carefully avoid every alteration which might endanger the benefits of an united and effective government or which ought to await the future lessons of experience a reverence for the characteristic rights of freemen and a regard for the public harmony will sufficiently influence your deliberations on the question how far the former can be impregnably fortified or the latter be safely and advantageously promoted", 
## "To the foregoing observations I have one to add which will be most properly addressed to the House of Representatives It concerns myself and will therefore be as brief as possible When I was first honored with a call into the service of my country then on the eve of an arduous struggle for its liberties the light in which I contemplated my duty required that I should renounce every pecuniary compensation From this resolution I have in no instance departed and being still under the impressions which produced it I must decline as inapplicable to myself any share in the personal emoluments which may be indispensably included in a permanent provision for the executive department and must accordingly pray that the pecuniary estimates for the station in which I am placed may during my continuance in it be limited to such actual expenditures as the public good may be thought to require", 
## "Having thus imparted to you my sentiments as they have been awakened by the occasion which brings us together I shall take my present leave but not without resorting once more to the benign Parent of the Human Race in humble supplication that since He has been pleased to favor the American people with opportunities for deliberating in perfect tranquillity and dispositions for deciding with unparalleled unanimity on a form of government for the security of their union and the advancement of their happiness so His divine blessing may be equally conspicuous in the enlarged views the temperate consultations and the wise measures on which the success of this Government must depend", 
## ""))
## list()
## list()

3.4 Convert to lowercase

# Lower-case every document, then wrap each element as a PlainTextDocument
# (the output below shows `content` plus `meta` fields after this step).
docs <- docs %>%
  tm::tm_map(tolower) %>%
  tm::tm_map(tm::PlainTextDocument)
# Snapshot of the lower-cased corpus, taken before stopwords are removed.
DocsCopy <- docs
# Print the first entry of the corpus to inspect the result.
docs[1] %>%
  as.character() %>%
  writeLines()
## list(list(content = c("", "fellowcitizens of the senate and of the house of representatives", "among the vicissitudes incident to life no event could have filled me with greater anxieties than that of which the notification was transmitted by your order and received on the th day of the present month on the one hand i was summoned by my country whose voice i can never hear but with veneration and love from a retreat which i had chosen with the fondest predilection and in my flattering hopes with an immutable decision as the asylum of my declining years—a retreat which was rendered every day more necessary as well as more dear to me by the addition of habit to inclination and of frequent interruptions in my health to the gradual waste committed on it by time on the other hand the magnitude and difficulty of the trust to which the voice of my country called me being sufficient to awaken in the wisest and most experienced of her citizens a distrustful scrutiny into his qualifications could not but overwhelm with despondence one who inheriting inferior endowments from nature and unpracticed in the duties of civil administration ought to be peculiarly conscious of his own deficiencies in this conflict of emotions all i dare aver is that it has been my faithful study to collect my duty from a just appreciation of every circumstance by which it might be affected all i dare hope is that if in executing this task i have been too much swayed by a grateful remembrance of former instances or by an affectionate sensibility to this transcendent proof of the confidence of my fellowcitizens and have thence too little consulted my incapacity as well as disinclination for the weighty and untried cares before me my error will be palliated by the motives which mislead see app note me and its consequences be judged by my country with some share of the partiality in which they originated", 
## "such being the impressions under which i have in obedience to the public summons repaired to the present station it would be peculiarly improper to omit in this first official act my fervent supplications to that almighty being who rules over the universe who presides in the councils of nations and whose providential aids can supply every human defect that his benediction may consecrate to the liberties and happiness of the people of the united states a government instituted by themselves for these essential purposes and may enable every instrument employed in its administration to execute with success the functions allotted to his charge in tendering this homage to the great author of every public and private good i assure myself that it expresses your sentiments not less than my own nor those of my fellowcitizens at large less than either no people can be bound to acknowledge and adore the invisible hand which conducts the affairs of men more than those of the united states every step by which they have advanced to the character of an independent nation seems to have been distinguished by some token of providential agency and in the important revolution just accomplished in the system of their united government the tranquil deliberations and voluntary consent of so many distinct communities from which the event has resulted can not be compared with the means by which most governments have been established without some return of pious gratitude along with an humble anticipation of the future blessings which the past seem to presage these reflections arising out of the present crisis have forced themselves too strongly on my mind to be suppressed you will join with me i trust in thinking that there are none under the influence of which the proceedings of a new and free government can more auspiciously commence", 
## "by the article establishing the executive department it is made the duty of the president to recommend to your consideration such measures as he shall judge necessary and expedient the circumstances under which i now meet you will acquit me from entering into that subject further than to refer to the great constitutional charter under which you are assembled and which in defining your powers designates the objects to which your attention is to be given it will be more consistent with those circumstances and far more congenial with the feelings which actuate me to substitute in place of a recommendation of particular measures the tribute that is due to the talents the rectitude and the patriotism which adorn the characters selected to devise and adopt them in these honorable qualifications i behold the surest pledges that as on one side no local prejudices or attachments no separate views nor party animosities will misdirect the comprehensive and equal eye which ought to watch over this great assemblage of communities and interests so on another that the foundation of our national policy will be laid in the pure and immutable principles of private morality and the preeminence of free government be exemplified by all the attributes which can win the affections of its citizens and command the respect of the world i dwell on this prospect with every satisfaction which an ardent love for my country can inspire since there is no truth more thoroughly established than that there exists in the economy and course of nature an indissoluble union between virtue and happiness between duty and advantage between the genuine maxims of an honest and magnanimous policy and the solid rewards of public prosperity and felicity since we ought to be no less persuaded that the propitious smiles of heaven can never be expected on a nation that disregards the eternal rules of order and right which heaven itself has ordained and since the preservation of the sacred fire of liberty and 
the destiny of the republican model of government are justly considered perhaps as deeply as finally staked on the experiment entrusted to the hands of the american people", 
## "besides the ordinary objects submitted to your care it will remain with your judgment to decide how far an exercise of the occasional power delegated by the fifth article of the constitution is rendered expedient at the present juncture by the nature of objections which have been urged against the system or by the degree of inquietude which has given birth to them instead of undertaking particular recommendations on this subject in which i could be guided by no lights derived from official opportunities i shall again give way to my entire confidence in your discernment and pursuit of the public good for i assure myself that whilst you carefully avoid every alteration which might endanger the benefits of an united and effective government or which ought to await the future lessons of experience a reverence for the characteristic rights of freemen and a regard for the public harmony will sufficiently influence your deliberations on the question how far the former can be impregnably fortified or the latter be safely and advantageously promoted", 
## "to the foregoing observations i have one to add which will be most properly addressed to the house of representatives it concerns myself and will therefore be as brief as possible when i was first honored with a call into the service of my country then on the eve of an arduous struggle for its liberties the light in which i contemplated my duty required that i should renounce every pecuniary compensation from this resolution i have in no instance departed and being still under the impressions which produced it i must decline as inapplicable to myself any share in the personal emoluments which may be indispensably included in a permanent provision for the executive department and must accordingly pray that the pecuniary estimates for the station in which i am placed may during my continuance in it be limited to such actual expenditures as the public good may be thought to require", 
## "having thus imparted to you my sentiments as they have been awakened by the occasion which brings us together i shall take my present leave but not without resorting once more to the benign parent of the human race in humble supplication that since he has been pleased to favor the american people with opportunities for deliberating in perfect tranquillity and dispositions for deciding with unparalleled unanimity on a form of government for the security of their union and the advancement of their happiness so his divine blessing may be equally conspicuous in the enlarged views the temperate consultations and the wise measures on which the success of this government must depend", 
## ""), meta = list(author = character(0), datetimestamp = list(sec = 35.854508638382, min = 24, hour = 11, mday = 3, mon = 0, year = 123, wday = 2, yday = 2, isdst = 0), description = character(0), heading = character(0), id = character(0), language = character(0), origin = character(0))))
## list()
## list()

3.5 Remove “stopwords”

# Number of entries in the English stopword list used below.
length(tm::stopwords("english"))
## [1] 174
# Drop those stopwords from every document and re-wrap each one as a
# PlainTextDocument. Removed words leave their surrounding spaces behind,
# which is why the printed text below contains runs of blanks.
docs <- docs %>%
  tm::tm_map(tm::removeWords, tm::stopwords("english")) %>%
  tm::tm_map(tm::PlainTextDocument)
# Print the first entry of the corpus to inspect the result.
docs[1] %>%
  as.character() %>%
  writeLines()
## list(list(content = c("", "fellowcitizens   senate    house  representatives", "among  vicissitudes incident  life  event   filled   greater anxieties      notification  transmitted   order  received   th day   present month   one hand   summoned   country whose voice  can never hear   veneration  love   retreat    chosen   fondest predilection    flattering hopes   immutable decision   asylum   declining years— retreat   rendered every day  necessary  well   dear     addition  habit  inclination   frequent interruptions   health   gradual waste committed    time    hand  magnitude  difficulty   trust    voice   country called   sufficient  awaken   wisest   experienced   citizens  distrustful scrutiny   qualifications    overwhelm  despondence one  inheriting inferior endowments  nature  unpracticed   duties  civil administration    peculiarly conscious    deficiencies   conflict  emotions   dare aver       faithful study  collect  duty   just appreciation  every circumstance    might  affected   dare hope     executing  task     much swayed   grateful remembrance  former instances    affectionate sensibility   transcendent proof   confidence   fellowcitizens   thence  little consulted  incapacity  well  disinclination   weighty  untried cares    error will  palliated   motives  mislead see app note    consequences  judged   country   share   partiality    originated", 
## "   impressions      obedience   public summons repaired   present station    peculiarly improper  omit   first official act  fervent supplications   almighty   rules   universe  presides   councils  nations  whose providential aids can supply every human defect   benediction may consecrate   liberties  happiness   people   united states  government instituted     essential purposes  may enable every instrument employed   administration  execute  success  functions allotted   charge  tendering  homage   great author  every public  private good  assure    expresses  sentiments  less        fellowcitizens  large less  either  people can  bound  acknowledge  adore  invisible hand  conducts  affairs  men      united states every step     advanced   character   independent nation seems    distinguished   token  providential agency    important revolution just accomplished   system   united government  tranquil deliberations  voluntary consent   many distinct communities    event  resulted can   compared   means    governments   established without  return  pious gratitude along   humble anticipation   future blessings   past seem  presage  reflections arising    present crisis  forced   strongly   mind   suppressed  will join    trust  thinking    none   influence    proceedings   new  free government can  auspiciously commence", 
## "  article establishing  executive department   made  duty   president  recommend   consideration  measures   shall judge necessary  expedient  circumstances    now meet  will acquit   entering   subject    refer   great constitutional charter     assembled    defining  powers designates  objects    attention    given  will   consistent   circumstances  far  congenial   feelings  actuate   substitute  place   recommendation  particular measures  tribute   due   talents  rectitude   patriotism  adorn  characters selected  devise  adopt    honorable qualifications  behold  surest pledges    one side  local prejudices  attachments  separate views  party animosities will misdirect  comprehensive  equal eye    watch   great assemblage  communities  interests   another   foundation   national policy will  laid   pure  immutable principles  private morality   preeminence  free government  exemplified    attributes  can win  affections   citizens  command  respect   world  dwell   prospect  every satisfaction   ardent love   country can inspire since    truth  thoroughly established    exists   economy  course  nature  indissoluble union  virtue  happiness  duty  advantage   genuine maxims   honest  magnanimous policy   solid rewards  public prosperity  felicity since      less persuaded   propitious smiles  heaven can never  expected   nation  disregards  eternal rules  order  right  heaven   ordained  since  preservation   sacred fire  liberty   destiny   republican model  government  justly considered perhaps  deeply  finally staked   experiment entrusted   hands   american people", 
## "besides  ordinary objects submitted   care  will remain   judgment  decide  far  exercise   occasional power delegated   fifth article   constitution  rendered expedient   present juncture   nature  objections    urged   system    degree  inquietude   given birth   instead  undertaking particular recommendations   subject      guided   lights derived  official opportunities  shall  give way   entire confidence   discernment  pursuit   public good   assure   whilst  carefully avoid every alteration  might endanger  benefits   united  effective government     await  future lessons  experience  reverence   characteristic rights  freemen   regard   public harmony will sufficiently influence  deliberations   question  far  former can  impregnably fortified   latter  safely  advantageously promoted", 
## "  foregoing observations   one  add  will   properly addressed   house  representatives  concerns   will therefore   brief  possible    first honored   call   service   country    eve   arduous struggle   liberties  light    contemplated  duty required    renounce every pecuniary compensation   resolution     instance departed   still   impressions  produced   must decline  inapplicable    share   personal emoluments  may  indispensably included   permanent provision   executive department  must accordingly pray   pecuniary estimates   station     placed may   continuance    limited   actual expenditures   public good may  thought  require", 
## " thus imparted    sentiments     awakened   occasion  brings us together  shall take  present leave   without resorting     benign parent   human race  humble supplication  since    pleased  favor  american people  opportunities  deliberating  perfect tranquillity  dispositions  deciding  unparalleled unanimity   form  government   security   union   advancement   happiness   divine blessing may  equally conspicuous   enlarged views  temperate consultations   wise measures    success   government must depend", 
## ""), meta = list(author = character(0), datetimestamp = list(sec = 36.0978336334229, min = 24, hour = 11, mday = 3, mon = 0, year = 123, wday = 2, yday = 2, isdst = 0), description = character(0), heading = character(0), id = character(0), language = character(0), origin = character(0))))
## list()
## list()

3.6 Remove particular stopwords

#docs <- tm::tm_map(docs, removeWords, c("syllogism", "tautology"))   
# Just remove the words "syllogism" and "tautology". 
# These words don't actually exist in these texts. But this is how you would remove them if they had.

3.7 Retain compound words

If you wish to preserve a concept that is only apparent as a collection of two or more words, you can combine those words, or reduce them to a meaningful acronym, before you begin the analysis. Here, I am using examples that are particular to qualitative data analysis.

# Replace selected multi-word expressions with a single joined token.
compound_terms <- c(
  "fake news" = "fake_news",
  "inner city" = "inner-city",
  "politically correct" = "politically_correct"
)
for (doc_idx in seq_along(docs)) {
  for (phrase in names(compound_terms)) {
    docs[[doc_idx]] <- gsub(phrase, compound_terms[[phrase]], docs[[doc_idx]])
  }
}
# gsub() returns plain character vectors, so wrap them as tm text documents again.
docs <- tm_map(docs, PlainTextDocument)

3.8 Remove common word endings

Remove common word endings (stemming), e.g. “ing”, “es”, “s”.

## Note: I did not run this section of code for this particular example.
# Stem every document, then re-wrap the results as plain text documents.
docs_st <- docs |>
  tm_map(stemDocument) |>
  tm_map(PlainTextDocument)
# Print the first document to check that stemming worked.
docs_st[1] |>
  as.character() |>
  writeLines()
## list(list(content = c("", "fellowcitizen senat hous repres", "among vicissitud incid life event fill greater anxieti notif transmit order receiv th day present month one hand summon countri whose voic can never hear vener love retreat chosen fondest predilect flatter hope immut decis asylum declin years— retreat render everi day necessari well dear addit habit inclin frequent interrupt health gradual wast commit time hand magnitud difficulti trust voic countri call suffici awaken wisest experienc citizen distrust scrutini qualif overwhelm despond one inherit inferior endow natur unpract duti civil administr peculiar conscious defici conflict emot dare aver faith studi collect duti just appreci everi circumst might affect dare hope execut task much sway grate remembr former instanc affection sensibl transcend proof confid fellowcitizen thenc littl consult incapac well disinclin weighti untri care error will palliat motiv mislead see app note consequ judg countri share partial origin", 
## "impress obedi public summon repair present station peculiar improp omit first offici act fervent supplic almighti rule univers presid council nation whose providenti aid can suppli everi human defect benedict may consecr liberti happi peopl unit state govern institut essenti purpos may enabl everi instrument employ administr execut success function allot charg tender homag great author everi public privat good assur express sentiment less fellowcitizen larg less either peopl can bound acknowledg ador invis hand conduct affair men unit state everi step advanc charact independ nation seem distinguish token providenti agenc import revolut just accomplish system unit govern tranquil deliber voluntari consent mani distinct communiti event result can compar mean govern establish without return pious gratitud along humbl anticip futur bless past seem presag reflect aris present crisi forc strong mind suppress will join trust think none influenc proceed new free govern can auspici commenc", 
## "articl establish execut depart made duti presid recommend consider measur shall judg necessari expedi circumst now meet will acquit enter subject refer great constitut charter assembl defin power design object attent given will consist circumst far congeni feel actuat substitut place recommend particular measur tribut due talent rectitud patriot adorn charact select devis adopt honor qualif behold surest pledg one side local prejudic attach separ view parti animos will misdirect comprehens equal eye watch great assemblag communiti interest anoth foundat nation polici will laid pure immut principl privat moral preemin free govern exemplifi attribut can win affect citizen command respect world dwell prospect everi satisfact ardent love countri can inspir sinc truth thorough establish exist economi cours natur indissolubl union virtu happi duti advantag genuin maxim honest magnanim polici solid reward public prosper felic sinc less persuad propiti smile heaven can never expect nation disregard etern rule order right heaven ordain sinc preserv sacr fire liberti destini republican model govern just consid perhap deepli final stake experi entrust hand american peopl", 
## "besid ordinari object submit care will remain judgment decid far exercis occasion power deleg fifth articl constitut render expedi present junctur natur object urg system degre inquietud given birth instead undertak particular recommend subject guid light deriv offici opportun shall give way entir confid discern pursuit public good assur whilst care avoid everi alter might endang benefit unit effect govern await futur lesson experi rever characterist right freemen regard public harmoni will suffici influenc deliber question far former can impregn fortifi latter safe advantag promot", 
## "forego observ one add will proper address hous repres concern will therefor brief possibl first honor call servic countri eve arduous struggl liberti light contempl duti requir renounc everi pecuniari compens resolut instanc depart still impress produc must declin inapplic share person emolu may indispens includ perman provis execut depart must accord pray pecuniari estim station place may continu limit actual expenditur public good may thought requir", "thus impart sentiment awaken occas bring us togeth shall take present leav without resort benign parent human race humbl supplic sinc pleas favor american peopl opportun deliber perfect tranquil disposit decid unparallel unanim form govern secur union advanc happi divin bless may equal conspicu enlarg view temper consult wise measur success govern must depend", 
## ""), meta = list(author = character(0), datetimestamp = list(sec = 36.2355585098267, min = 24, hour = 11, mday = 3, mon = 0, year = 123, wday = 2, yday = 2, isdst = 0), description = character(0), heading = character(0), id = character(0), language = character(0), origin = character(0))))
## list()
## list()
# docs <- docs_st

3.9 Strip unnecessary whitespace

# Collapse runs of whitespace inside each document to a single blank.
docs <- docs |> tm_map(stripWhitespace)
# Print the first document to inspect the result.
docs[1] |>
  as.character() |>
  writeLines()
## list(list(content = c("", "fellowcitizens senate house representatives", "among vicissitudes incident life event filled greater anxieties notification transmitted order received th day present month one hand summoned country whose voice can never hear veneration love retreat chosen fondest predilection flattering hopes immutable decision asylum declining years— retreat rendered every day necessary well dear addition habit inclination frequent interruptions health gradual waste committed time hand magnitude difficulty trust voice country called sufficient awaken wisest experienced citizens distrustful scrutiny qualifications overwhelm despondence one inheriting inferior endowments nature unpracticed duties civil administration peculiarly conscious deficiencies conflict emotions dare aver faithful study collect duty just appreciation every circumstance might affected dare hope executing task much swayed grateful remembrance former instances affectionate sensibility transcendent proof confidence fellowcitizens thence little consulted incapacity well disinclination weighty untried cares error will palliated motives mislead see app note consequences judged country share partiality originated", 
## " impressions obedience public summons repaired present station peculiarly improper omit first official act fervent supplications almighty rules universe presides councils nations whose providential aids can supply every human defect benediction may consecrate liberties happiness people united states government instituted essential purposes may enable every instrument employed administration execute success functions allotted charge tendering homage great author every public private good assure expresses sentiments less fellowcitizens large less either people can bound acknowledge adore invisible hand conducts affairs men united states every step advanced character independent nation seems distinguished token providential agency important revolution just accomplished system united government tranquil deliberations voluntary consent many distinct communities event resulted can compared means governments established without return pious gratitude along humble anticipation future blessings past seem presage reflections arising present crisis forced strongly mind suppressed will join trust thinking none influence proceedings new free government can auspiciously commence", 
## " article establishing executive department made duty president recommend consideration measures shall judge necessary expedient circumstances now meet will acquit entering subject refer great constitutional charter assembled defining powers designates objects attention given will consistent circumstances far congenial feelings actuate substitute place recommendation particular measures tribute due talents rectitude patriotism adorn characters selected devise adopt honorable qualifications behold surest pledges one side local prejudices attachments separate views party animosities will misdirect comprehensive equal eye watch great assemblage communities interests another foundation national policy will laid pure immutable principles private morality preeminence free government exemplified attributes can win affections citizens command respect world dwell prospect every satisfaction ardent love country can inspire since truth thoroughly established exists economy course nature indissoluble union virtue happiness duty advantage genuine maxims honest magnanimous policy solid rewards public prosperity felicity since less persuaded propitious smiles heaven can never expected nation disregards eternal rules order right heaven ordained since preservation sacred fire liberty destiny republican model government justly considered perhaps deeply finally staked experiment entrusted hands american people", 
## "besides ordinary objects submitted care will remain judgment decide far exercise occasional power delegated fifth article constitution rendered expedient present juncture nature objections urged system degree inquietude given birth instead undertaking particular recommendations subject guided lights derived official opportunities shall give way entire confidence discernment pursuit public good assure whilst carefully avoid every alteration might endanger benefits united effective government await future lessons experience reverence characteristic rights freemen regard public harmony will sufficiently influence deliberations question far former can impregnably fortified latter safely advantageously promoted", 
## " foregoing observations one add will properly addressed house representatives concerns will therefore brief possible first honored call service country eve arduous struggle liberties light contemplated duty required renounce every pecuniary compensation resolution instance departed still impressions produced must decline inapplicable share personal emoluments may indispensably included permanent provision executive department must accordingly pray pecuniary estimates station placed may continuance limited actual expenditures public good may thought require", 
## " thus imparted sentiments awakened occasion brings us together shall take present leave without resorting benign parent human race humble supplication since pleased favor american people opportunities deliberating perfect tranquillity dispositions deciding unparalleled unanimity form government security union advancement happiness divine blessing may equally conspicuous enlarged views temperate consultations wise measures success government must depend", ""), meta = list(author = character(0), datetimestamp = list(
##     sec = 36.146283864975, min = 24, hour = 11, mday = 3, mon = 0, year = 123, wday = 2, yday = 2, isdst = 0), description = character(0), heading = character(0), id = character(0), language = character(0), origin = character(0))))
## list()
## list()

3.10 Type Check

Be sure to use the following script once you have completed preprocessing. This tells R to treat the preprocessed documents as text documents.

# Final preprocessing step: make sure every element of the corpus is a tm
# PlainTextDocument so that downstream tm functions treat it as text.
# (This previously repeated the stripWhitespace call, which did not perform
# the conversion this step is meant to do.)
docs <- tm::tm_map(docs, tm::PlainTextDocument)
# Print the first document to confirm the corpus is still intact.
writeLines(as.character(docs[1]))
## list(list(content = c("", "fellowcitizens senate house representatives", "among vicissitudes incident life event filled greater anxieties notification transmitted order received th day present month one hand summoned country whose voice can never hear veneration love retreat chosen fondest predilection flattering hopes immutable decision asylum declining years— retreat rendered every day necessary well dear addition habit inclination frequent interruptions health gradual waste committed time hand magnitude difficulty trust voice country called sufficient awaken wisest experienced citizens distrustful scrutiny qualifications overwhelm despondence one inheriting inferior endowments nature unpracticed duties civil administration peculiarly conscious deficiencies conflict emotions dare aver faithful study collect duty just appreciation every circumstance might affected dare hope executing task much swayed grateful remembrance former instances affectionate sensibility transcendent proof confidence fellowcitizens thence little consulted incapacity well disinclination weighty untried cares error will palliated motives mislead see app note consequences judged country share partiality originated", 
## " impressions obedience public summons repaired present station peculiarly improper omit first official act fervent supplications almighty rules universe presides councils nations whose providential aids can supply every human defect benediction may consecrate liberties happiness people united states government instituted essential purposes may enable every instrument employed administration execute success functions allotted charge tendering homage great author every public private good assure expresses sentiments less fellowcitizens large less either people can bound acknowledge adore invisible hand conducts affairs men united states every step advanced character independent nation seems distinguished token providential agency important revolution just accomplished system united government tranquil deliberations voluntary consent many distinct communities event resulted can compared means governments established without return pious gratitude along humble anticipation future blessings past seem presage reflections arising present crisis forced strongly mind suppressed will join trust thinking none influence proceedings new free government can auspiciously commence", 
## " article establishing executive department made duty president recommend consideration measures shall judge necessary expedient circumstances now meet will acquit entering subject refer great constitutional charter assembled defining powers designates objects attention given will consistent circumstances far congenial feelings actuate substitute place recommendation particular measures tribute due talents rectitude patriotism adorn characters selected devise adopt honorable qualifications behold surest pledges one side local prejudices attachments separate views party animosities will misdirect comprehensive equal eye watch great assemblage communities interests another foundation national policy will laid pure immutable principles private morality preeminence free government exemplified attributes can win affections citizens command respect world dwell prospect every satisfaction ardent love country can inspire since truth thoroughly established exists economy course nature indissoluble union virtue happiness duty advantage genuine maxims honest magnanimous policy solid rewards public prosperity felicity since less persuaded propitious smiles heaven can never expected nation disregards eternal rules order right heaven ordained since preservation sacred fire liberty destiny republican model government justly considered perhaps deeply finally staked experiment entrusted hands american people", 
## "besides ordinary objects submitted care will remain judgment decide far exercise occasional power delegated fifth article constitution rendered expedient present juncture nature objections urged system degree inquietude given birth instead undertaking particular recommendations subject guided lights derived official opportunities shall give way entire confidence discernment pursuit public good assure whilst carefully avoid every alteration might endanger benefits united effective government await future lessons experience reverence characteristic rights freemen regard public harmony will sufficiently influence deliberations question far former can impregnably fortified latter safely advantageously promoted", 
## " foregoing observations one add will properly addressed house representatives concerns will therefore brief possible first honored call service country eve arduous struggle liberties light contemplated duty required renounce every pecuniary compensation resolution instance departed still impressions produced must decline inapplicable share personal emoluments may indispensably included permanent provision executive department must accordingly pray pecuniary estimates station placed may continuance limited actual expenditures public good may thought require", 
## " thus imparted sentiments awakened occasion brings us together shall take present leave without resorting benign parent human race humble supplication since pleased favor american people opportunities deliberating perfect tranquillity dispositions deciding unparalleled unanimity form government security union advancement happiness divine blessing may equally conspicuous enlarged views temperate consultations wise measures success government must depend", ""), meta = list(author = character(0), datetimestamp = list(
##     sec = 36.146283864975, min = 24, hour = 11, mday = 3, mon = 0, year = 123, wday = 2, yday = 2, isdst = 0), description = character(0), heading = character(0), id = character(0), language = character(0), origin = character(0))))
## list()
## list()
# Row count of df; the export loop below iterates over these rows.
nrow(df)
## [1] 59
#Elnaz

# Write each preprocessed document to ./data/pre_processed/<year>-<president>.txt.
# Assumes row i of df describes docs[[i]] -- TODO confirm the two stay aligned.
out_dir <- "./data/pre_processed"
# writeLines() fails if the target directory is missing, so create it up front.
dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)
for (i in seq_len(nrow(df))) {  # seq_len() gives zero iterations for a 0-row df
  df_i <- df[i, ]
  name <- df_i$president
  year <- df_i$year
  file_name <- paste0(year, "-", name, ".txt")
  loc <- file.path(out_dir, file_name)
  writeLines(as.character(docs[[i]]), loc)
}

3.11 Create Doc Term Matrix

# Build the document-term matrix: documents as rows, terms as columns.
dtm <- docs |> tm::DocumentTermMatrix()
dtm
## <<DocumentTermMatrix (documents: 59, terms: 9495)>>
## Non-/sparse entries: 40113/520092
## Sparsity           : 93%
## Maximal term length: 23
## Weighting          : term frequency (tf)

Storing transpose of matrix

# Build the term-document matrix: terms as rows, documents as columns.
tdm <- docs |> tm::TermDocumentMatrix()
tdm
## <<TermDocumentMatrix (terms: 9495, documents: 59)>>
## Non-/sparse entries: 40113/520092
## Sparsity           : 93%
## Maximal term length: 23
## Weighting          : term frequency (tf)

3.12 Organize by frequency

# Convert the sparse document-term matrix to a dense matrix once and reuse it,
# instead of calling as.matrix(dtm) separately for the sums and for `m`.
m <- as.matrix(dtm)
# Total number of occurrences of each term across all documents.
freq <- colSums(m)
length(freq)
## [1] 9495
# Permutation that sorts the terms by increasing frequency.
ord <- order(freq)
dim(m)
## [1]   59 9495

Write the matrix to disk as a CSV file (disabled here)

#write.csv(m, file="DocumentTermMatrix.csv")   

3.13 Remove sparse words

# Drop sparse terms: keep only terms whose share of empty (zero-count)
# document entries is below 0.2.
dtms <- dtm |> removeSparseTerms(sparse = 0.2)
dtms
## <<DocumentTermMatrix (documents: 59, terms: 25)>>
## Non-/sparse entries: 1298/177
## Sparsity           : 12%
## Maximal term length: 10
## Weighting          : term frequency (tf)

4 Word Frequency

# Total number of occurrences of each term across all documents.
freq <- dtm |>
  as.matrix() |>
  colSums()

Least frequent

# First 20 entries of the frequency-of-frequencies table (the rarest counts).
freq |>
  table() |>
  head(20)
## freq
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16 
## 4154 1404  788  535  359  291  226  177  162  146  112   85   82   90   43   59 
##   17   18   19   20 
##   49   35   47   38

The top number is the frequency with which words appear and the bottom number reflects how many words appear that frequently.

Most frequent:

# Last 40 entries of the frequency-of-frequencies table (the highest counts).
freq |>
  table() |>
  tail(40)
## freq
## 134 137 138 139 142 143 147 150 155 157 159 171 179 184 185 198 207 210 221 222 
##   1   1   2   1   1   1   1   1   1   2   1   1   1   2   1   1   1   1   1   1 
## 227 232 240 250 256 267 302 303 304 314 318 337 341 346 373 374 488 567 576 942 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1

View a table of the most frequent terms in the full document-term matrix, sorted by decreasing frequency

# Per-term totals over all documents, ordered from most to least frequent.
term_totals <- colSums(as.matrix(dtm))
freq <- sort(term_totals, decreasing = TRUE)
head(freq, 20)
##       will     people government        can       upon       must      great 
##        942        576        567        488        374        373        346 
##        may     states      world      shall    country     nation      every 
##        341        337        318        314        304        303        302 
##        one      peace        new      power        now     public 
##        267        256        250        240        232        227

Identify all terms that appear frequently

# Terms that occur at least 50 times overall; show the first 20 of them.
head(findFreqTerms(dtm, lowfreq = 50), 20)
##  [1] "act"            "action"         "administration" "also"          
##  [5] "always"         "america"        "american"       "americans"     
##  [9] "among"          "another"        "authority"      "become"        
## [13] "believe"        "best"           "better"         "beyond"        
## [17] "business"       "called"         "can"            "cause"

Another approach to perform the same task:

# Word-frequency data frame built from the named frequency vector.
wf <- data.frame(
  word = names(freq),
  freq = freq
)
wf |> head()
##                  word freq
## will             will  942
## people         people  576
## government government  567
## can               can  488
## upon             upon  374
## must             must  373

4.1 Plot

# Bar chart of every term that occurs more than 100 times, most frequent first.
frequent_terms <- subset(wf, freq > 100)
p <- ggplot(frequent_terms, aes(x = reorder(word, -freq), y = freq))
p <- p + geom_bar(stat = "identity")
# Tilt the term labels so they do not overlap.
p <- p + theme(axis.text.x = element_text(angle = 45, hjust = 1))

p

5 Relationships Between Terms

# Terms whose correlation with "government" or "states" is at least 0.75.
tm::findAssocs(
  dtm,
  terms = c("government", "states"),
  corlimit = 0.75
)
## $government
## system 
##   0.79 
## 
## $states
##   powers sections   united 
##     0.77     0.76     0.76
# Same lookup on the sparse-reduced matrix, with a correlation limit of 0.70.
findAssocs(dtms, terms = "government", corlimit = 0.70)
## $government
## states 
##   0.75

Word Clouds

Plot words that occur at least 20 times.

Colorized version:

# Fix the RNG seed so the word layout is reproducible.
set.seed(142)
wordcloud::wordcloud(
  words = names(freq),
  freq = freq,
  min.freq = 20,
  scale = c(5, 0.1),
  colors = brewer.pal(6, "Dark2")
)

Plot the 100 most frequent words.

# Six-colour "Dark2" palette from RColorBrewer.
dark2 <- brewer.pal(6, "Dark2")
# Fix the RNG seed so the word layout is reproducible.
set.seed(142)
wordcloud::wordcloud(
  words = names(freq),
  freq = freq,
  max.words = 100,
  rot.per = 0.2,
  colors = dark2
)

6 Clustering by Term Similarity

6.1 Hierarchical Clustering

# Euclidean distances between terms (rows of the transposed, sparse-reduced
# matrix). The method name uses the canonical spelling "euclidean" rather than
# depending on dist()'s tolerance for the "euclidian" misspelling.
d <- dist(t(dtms), method = "euclidean")
# Complete-linkage hierarchical clustering of the terms.
fit <- hclust(d = d, method = "complete")   # for a different look try substituting: method="ward.D"
fit
## 
## Call:
## hclust(d = d, method = "complete")
## 
## Cluster method   : complete 
## Distance         : euclidean 
## Number of objects: 25
# Plain dendrogram; hang = -1 places all labels at the same height.
plot(fit, hang = -1)

# Same dendrogram again, this time with the clusters outlined.
plot.new()
plot(fit, hang = -1)
n_clusters <- 6   # number of clusters to cut the tree into
groups <- cutree(fit, k = n_clusters)
rect.hclust(fit, k = n_clusters, border = "red")   # red boxes around the clusters

6.2 K-means Clustering

# k-means on the term distance matrix, first with 2 and then with 4 clusters.
# The distances are the same for both fits, so they are computed only once.
# NOTE(review): kmeans() picks random starting centres; call set.seed() first
# if the cluster plots need to be reproducible.
d <- dist(t(dtms), method = "euclidean")

kfit <- kmeans(d, 2)
# TRUE is spelled out because T is an ordinary, reassignable variable.
clusplot(as.matrix(d), kfit$cluster, color = TRUE, shade = TRUE, labels = 2, lines = 0)

kfit <- kmeans(d, 4)
clusplot(as.matrix(d), kfit$cluster, color = TRUE, shade = TRUE, labels = 2, lines = 0)

7 Doc Similarity

//TODO: Perform doc similarity using the textreuse library; analyze and visualize results.

#loc <- "/home/hamed/Documents/R/MADS-NLP/data/texts"
#docs <- tm::VCorpus(DirSource(loc)) 

# Directory holding the preprocessed speeches written earlier (Elnaz).
loc <- "./data/pre_processed/"
# Load every file in that directory into a textreuse corpus.
corpus <- textreuse::TextReuseCorpus(dir = loc)


# Score every pair of documents with the Jaccard similarity of their tokens.
comparisons <- pairwise_compare(corpus, jaccard_similarity)
# Flatten the comparison matrix into one row per document pair.
candidates <- pairwise_candidates(comparisons)
compare_df <- as.data.frame(candidates, col.names = names(candidates))
# Rank pairs from most to least similar. The trailing comma inside `[` is what
# makes this a row reorder rather than a column selection.
score_rank <- order(compare_df$score, decreasing = TRUE)
compare_df <- compare_df[score_rank, ]
head(compare_df, 3)
##                          a                          b       score
## 38  1789-George Washington 1941-Franklin D. Roosevelt 0.008675079
## 879    1857-James Buchanan         1973-Richard Nixon 0.006329114
## 182  1801-Thomas Jefferson         1845-James K. Polk 0.006201044
# Najada: show the corpus summary (document count, hash function, tokenizer).
print(corpus)
## TextReuseCorpus
## Number of documents: 59 
## hash_func : hash_string 
## tokenizer : tokenize_ngrams
# Dump the first corpus entry as text. `corpus[1]` is a one-document corpus,
# not a single document, so as.character() produces the deparsed internal
# list (content, hashes, metadata) rather than just the speech.
# NOTE(review): if only the speech text is wanted, content(corpus[[1]]) is
# presumably the intended call - confirm.
corpus[1] |>
  as.character() |>
  writeLines()
## list(`1789-George Washington` = list(content = "\nfellowcitizens senate house representatives\namong vicissitudes incident life event filled greater anxieties notification transmitted order received th day present month one hand summoned country whose voice can never hear veneration love retreat chosen fondest predilection flattering hopes immutable decision asylum declining years— retreat rendered every day necessary well dear addition habit inclination frequent interruptions health gradual waste committed time hand magnitude difficulty trust voice country called sufficient awaken wisest experienced citizens distrustful scrutiny qualifications overwhelm despondence one inheriting inferior endowments nature unpracticed duties civil administration peculiarly conscious deficiencies conflict emotions dare aver faithful study collect duty just appreciation every circumstance might affected dare hope executing task much swayed grateful remembrance former instances affectionate sensibility transcendent proof confidence fellowcitizens thence little consulted incapacity well disinclination weighty untried cares error will palliated motives mislead see app note consequences judged country share partiality originated\n impressions obedience public summons repaired present station peculiarly improper omit first official act fervent supplications almighty rules universe presides councils nations whose providential aids can supply every human defect benediction may consecrate liberties happiness people united states government instituted essential purposes may enable every instrument employed administration execute success functions allotted charge tendering homage great author every public private good assure expresses sentiments less fellowcitizens large less either people can bound acknowledge adore invisible hand conducts affairs men united states every step advanced character independent nation seems distinguished token providential agency important revolution just 
accomplished system united government tranquil deliberations voluntary consent many distinct communities event resulted can compared means governments established without return pious gratitude along humble anticipation future blessings past seem presage reflections arising present crisis forced strongly mind suppressed will join trust thinking none influence proceedings new free government can auspiciously commence\n article establishing executive department made duty president recommend consideration measures shall judge necessary expedient circumstances now meet will acquit entering subject refer great constitutional charter assembled defining powers designates objects attention given will consistent circumstances far congenial feelings actuate substitute place recommendation particular measures tribute due talents rectitude patriotism adorn characters selected devise adopt honorable qualifications behold surest pledges one side local prejudices attachments separate views party animosities will misdirect comprehensive equal eye watch great assemblage communities interests another foundation national policy will laid pure immutable principles private morality preeminence free government exemplified attributes can win affections citizens command respect world dwell prospect every satisfaction ardent love country can inspire since truth thoroughly established exists economy course nature indissoluble union virtue happiness duty advantage genuine maxims honest magnanimous policy solid rewards public prosperity felicity since less persuaded propitious smiles heaven can never expected nation disregards eternal rules order right heaven ordained since preservation sacred fire liberty destiny republican model government justly considered perhaps deeply finally staked experiment entrusted hands american people\nbesides ordinary objects submitted care will remain judgment decide far exercise occasional power delegated fifth article constitution rendered expedient present 
juncture nature objections urged system degree inquietude given birth instead undertaking particular recommendations subject guided lights derived official opportunities shall give way entire confidence discernment pursuit public good assure whilst carefully avoid every alteration might endanger benefits united effective government await future lessons experience reverence characteristic rights freemen regard public harmony will sufficiently influence deliberations question far former can impregnably fortified latter safely advantageously promoted\n foregoing observations one add will properly addressed house representatives concerns will therefore brief possible first honored call service country eve arduous struggle liberties light contemplated duty required renounce every pecuniary compensation resolution instance departed still impressions produced must decline inapplicable share personal emoluments may indispensably included permanent provision executive department must accordingly pray pecuniary estimates station placed may continuance limited actual expenditures public good may thought require\n thus imparted sentiments awakened occasion brings us together shall take present leave without resorting benign parent human race humble supplication since pleased favor american people opportunities deliberating perfect tranquillity dispositions deciding unparalleled unanimity form government security union advancement happiness divine blessing may equally conspicuous enlarged views temperate consultations wise measures success government must depend\n", 
##     tokens = NULL, hashes = c(587687171, 1400508662, -1528904364, -1176008363, 2104028628, 242661590, -1305151396, -1007533842, 1537695272, 716812531, 1724523801, -2029856456, -1559951769, 248799671, 751783772, 1056616327, -365629900, 2028634587, -1779240488, 584989554, 966616660, 2023068956, -461585151, 1235627038, 1461987856, 2109740839, 296398647, 1516495441, -1759252034, -950716712, -1700284989, 474818159, -2145645941, 959005317, 1792189935, 526937258, 1320515610, -1870374867, 533049850, -1667500355, 
##     -2013349359, 1181689723, 2091170364, 1446288897, -1291093787, 678659832, -66358167, -954065810, -1186750319, 397886169, 1947985790, 1787954948, -1376396193, 2024928609, -1993756363, -2035282470, 1709616450, -1142690907, -87400543, -1780932891, 1275769164, -1238247474, -1795675854, 2135756502, -147093800, -555192427, -1320760355, 649342408, 443548482, 61830623, 198592979, -603050151, -1728163943, -1792610003, -51009840, -1302594033, 996085435, 1526349563, 678044707, 453523929, 586226657, -1699176042, 
##     169265343, -515095572, -395221959, -99484685, -294414424, -1126964556, -1403486852, 816456272, 733257215, 920525667, -345995926, -1823464731, 1147597892, 1354415655, 1854078669, -1621228001, 1927301929, -1345522747, 1038471858, -244681622, -892578995, 72381548, 1270857288, 849915209, 993045926, -1941284721, 1545847062, 38047156, 698790758, 1074535113, -113479895, -1989511143, -1755919354, 1745815471, 2025787534, -1278537472, 1513749684, -723464549, 1796515351, -512470495, -1772972627, -103986019, 
##     -541837969, -332140653, -41938622, 935988054, -287721816, 226643687, 254446636, -1205363655, -81339928, 1916956172, -1114498525, 1973504928, -934931095, 708855922, -1888749922, 2087149692, 2144294272, 1593169418, -1927058696, -1777509480, -1664836722, -1020860280, -1502327335, -968650868, -649366632, -494834276, -2024279898, 410558282, 1189368521, 1323869532, 1338465938, 467733909, -1018544674, 1969474929, -371125453, -9171236, 1519063877, 1286598646, 1424388929, 186099284, -1770712336, -1443119936, 
##     831240281, 167302766, 1386444513, 2005304832, -202627190, -834445537, -1368329272, 380808289, -551303023, 1934873975, -154147308, 945961261, 625181640, 1874442376, -1073232328, -193499807, 714424653, -515533330, 2128767955, 562009042, 1404304799, 1619715984, -946751896, -869481633, 875871073, -1770942317, -634084209, 1231605490, -495036420, -2068560466, 2031343882, 536520523, 1063547700, 1637046830, -1286846358, -1211670658, -757319013, 1335579505, -645935361, 2135177199, -541915347, 1330624931, 
##     636932188, 1522355160, 1258497539, 1048273565, 557472728, -1156793337, 1297653750, 1853178493, 37273762, -956435664, -277103692, 770234992, -1152061625, 299770228, -180120082, 965657393, 1427881378, -121825537, -593430032, 1120961923, 1853608939, -227009182, -545589378, 604343275, 54439085, -1551532025, -207087041, 66678883, 1925704464, -2045922252, 129959685, 1861493628, -1964043475, 942061079, 987665613, -916763261, -1163102381, -884820054, 496757108, -1440161153, -128751657, 2126468541, 506779190, 
##     -690778854, -470997245, -1596300445, -668638245, -1325980228, -953995496, 1876751852, -700155755, 564026092, -1567717831, -987311879, 1971826499, 428076561, -1967041995, 1994730386, -709766214, -709741389, 2140793596, 1297607958, 1901773077, -1117184829, 305571156, 359973086, -111077089, 959816120, -984768994, -280513398, -549216946, -299548882, 847788327, -791141545, 721879760, 461320220, -1735365414, 65188856, 1592314897, 728532517, 1870809411, -93141376, -1694462618, 467593226, 923382187, 
##     1985266923, 1998388686, -1345495541, 819029135, 2001137044, -204338349, 1315007312, 1149535113, -1738771752, 57956568, -1725259379, 383530898, -812394143, -291585477, -1781070278, -38337638, -2081460016, 2069817247, 1162777367, 1368678671, 712542515, -140075497, 1506746088, 384065393, -405588973, 1902955964, 202742759, -1045686688, -753651315, 1110107017, -1921545089, 1652269712, -2095806876, -1480277942, 585069962, 776673625, 861096816, 983589687, -1509598572, 841473833, -2101045069, -1105440689, 
##     1116502339, -589244334, -627210225, 1558977457, -1187165814, -2072325718, 625008151, 1412345682, 119942981, 1353418914, -582658888, -1354122991, 729582651, 805068727, -380399197, -985862831, -138779692, -2016268868, -816225859, -1513153725, -1760765814, -1232844426, -1439682084, 1949345750, -1562470364, 1150362467, -109966971, 1474149194, 53748794, -1358513161, 1294174501, -109789284, 1235827108, 151781260, -1401449250, -308703952, -1352487516, 320940982, -2142548884, 8175128, -396966452, -1118882259, 
##     207823984, -1252670205, -1932639948, -2076541997, 995440370, -16760181, -749012868, -1424971204, 2095115152, 2130148753, 355575399, 212603945, -715715317, -1039787095, 1548652465, -926828080, 209190793, 389547656, 770650920, 519071644, 1721560869, -2055550679, 995539959, 1349426469, -272386170, -1054033410, -1388708457, 2017121382, -1302657021, 976160707, -915577174, -1401844529, -1058969706, 430503161, -1586187242, -1676540359, -443313598, 1348543069, 1304320134, 861286452, 72735592, -1627571465, 
##     1134277041, -1959208081, 864590062, 574606785, -1112220517, -1428687595, -1783787317, -1400688237, -1450441702, -2124775292, -1807714059, 399195817, -277514201, 1660255958, 592346957, -1946457483, -432533824, -1974694602, 310646559, -802576598, -974357456, 317468672, 612727181, -1496191349, -692697901, -874521377, 786167592, -2122350516, 1062919638, -708757281, -1461388590, 1056073807, 1340225372, 781897539, -1973499855, -1795965144, 965418590, 746919453, -1116538561, -1785986764, -1649185227, 
##     382250480, -436911901, 503703024, 1842305762, 1425240121, 1642198257, 1840809897, -1086012836, -721967427, 1413207480, 743667063, 402880650, 1393967382, 571580618, 282186835, -191395883, 1866940704, -1725704000, 2091777396, 614036410, -1684721682, -1608505298, -1223100962, 975572501, -1892854512, 1448058632, 1673360040, 1511845018, 950228943, -1644886043, -672037095, -1466344035, 869303991, -1792986760, -1219066159, -1553177641, 1211158170, 2047196414, -480494287, 1642867115, -1039588290, -2098084322, 
##     580935991, 1062508689, 1947468937, -293209911, 316754159, -2119615852, 1544563932, -1272015818, 1290136501, 1752833319, -628275304, -1957030177, -2123663366, -1854701749, 1538253965, -1276970472, 954249127, 490002465, 366039354, 446832742, 2097967162, 1628197313, -2106802246, -745737230, -1773610017, -1509061043, 1969614321, 1667617418, -1775572338, -61528855, -881661575, 758075731, 808992127, 407763413, -1685772086, 88988503, -485215697, 1224660249, 1262754192, 1316360811, 730725556, -1895962040, 
##     739632466, 112190582, -760569821, -696357463, 321222347, -932152648, -181125678, 1331213186, -37250300, 1785451918, 1984911491, 1049752826, 171265737, 648581895, 84716172, -1935422683, -1887938612, 1730609164, -891045666, 640952152, 675967154, 1025210682, -1590612499, 241213893, -1779016761, -772953297, 371142435, -725054360, -1504138522, 806480980, 966180365, -798098067, 1556657824, 547101515, 1446227486, 647201881, 2078758924, 1674319440, -676763924, 7601371, 308556602, -2121544895, 111915176, 
##     754845568, 418118757, 1097402806, 1251110189, -1332834031, 894982345, 542760788, 310690177, 2011239982, -1556430381, 2022761016, -1006829461, -1346320224, -508244653, 843396633, -2107491768, 661235271, 1594638859, -847261, -1861551742, 1991168055, -578164503, -1391384292, -2112455287, 575323136, -506097748, 2027091026, 684144166, 906872019, -1791087308, -1991936115, 919794053, 1345727137, -1368100382, 1416044718, 1301064130, 1932237702, 792666683, -2028606752, -837382500, -1555846557, 844852165, 
##     -821236830, -1225523181, 564712667, -967978313, -391403513, 532628575, -654336907, 1319076328, -1284386717, -1450290552, 1976147344, -1875527371, -1903869155, 613479004, -1383717481, -802577061, 1592465726, 95855341, 1720498184, -68535017, 609372993, 440864016, -1882282981, 220356884, 323636372, -911290700, 1302715528, 885234303, -228890949, 1232542523, 1752449529, 48313893), minhashes = NULL, meta = list(file = "./data/pre_processed//1789-George Washington.txt", hash_func = "hash_string", id = "1789-George Washington", 
##         minhash_func = NULL, tokenizer = "tokenize_ngrams")))
## list(hash_func = "hash_string", tokenizer = "tokenize_ngrams")

7.1 Similarity Score Plot

# Keep only the first 100 rows of compare_df (the highest-scoring pairs when
# the table is sorted by descending score); with more points the plot becomes
# unreadable. head() is used instead of `[1:100, ]` so that a table with fewer
# than 100 rows is not padded with all-NA rows.
compare_df_viz <- head(compare_df, 100)

# Shorten each document id to its year plus the president's initials by
# dropping every run of non-capital characters that follows a capital letter,
# e.g. "1789-George Washington" becomes "1789-GW".
initials_pattern <- "(?<=[A-Z])[^A-Z]+"
compare_df_viz$a <- gsub(initials_pattern, "", compare_df_viz$a, perl = TRUE)
compare_df_viz$b <- gsub(initials_pattern, "", compare_df_viz$b, perl = TRUE)

# 3D scatter plot: the two documents of a pair on x and y, their similarity
# score on z; marker colour and size also follow the score.
fig <- plot_ly(
  compare_df_viz,
  x = ~a,
  y = ~b,
  z = ~score,
  color = ~score,
  size = ~score
) |>
  add_markers() |>
  layout(
    scene = list(
      xaxis = list(title = 'Doc1'),
      yaxis = list(title = 'Doc2'),
      zaxis = list(title = 'Similarity Score')
    )
  )

fig
## Warning: `line.width` does not currently support multiple values.

Cosine Similarity

# Term-by-term cosine distance (1 - cosine similarity).
# Squared Euclidean norm of every term column of the document-term matrix.
term_sq_norms <- col_sums(dtms^2)
# Entry [i, j] is the product of the norms of term i and term j.
norm_products <- sqrt(term_sq_norms %*% t(term_sq_norms))
# crossprod gives the dot product of every pair of term columns.
cosine_dist_mat <- 1 - crossprod_simple_triplet_matrix(dtms) / norm_products

cosine_dist_mat
##             Terms
## Terms              can   country     every      free    future      good
##   can        0.0000000 0.3013170 0.3196871 0.3970037 0.3110346 0.2664495
##   country    0.3013170 0.0000000 0.2134492 0.3652111 0.4106707 0.3386620
##   every      0.3196871 0.2134492 0.0000000 0.4134426 0.2906490 0.3322699
##   free       0.3970037 0.3652111 0.4134426 0.0000000 0.4161580 0.3816995
##   future     0.3110346 0.4106707 0.2906490 0.4161580 0.0000000 0.2954737
##   good       0.2664495 0.3386620 0.3322699 0.3816995 0.2954737 0.0000000
##   government 0.2579230 0.2047350 0.2285687 0.4167750 0.4221402 0.3390539
##   great      0.3042754 0.2390763 0.2096882 0.4302002 0.3422671 0.3008674
##   just       0.3563657 0.3436575 0.2022730 0.4530560 0.2901312 0.4011886
##   life       0.3891276 0.5365314 0.4278774 0.3598682 0.3989592 0.3532778
##   may        0.2454736 0.2091903 0.2500473 0.3684674 0.4220194 0.2937487
##   must       0.2257792 0.3539029 0.3447757 0.3947449 0.2996883 0.3999765
##   nation     0.2810751 0.4035222 0.3253524 0.5014245 0.2368747 0.2633727
##   nations    0.3268656 0.3906216 0.4437372 0.3606915 0.3747685 0.4074177
##   new        0.3658506 0.6269561 0.4465935 0.5652935 0.3712807 0.4494017
##   now        0.2702416 0.3961642 0.3232435 0.4636810 0.3613852 0.3781249
##   one        0.1981736 0.2578042 0.2801983 0.3996113 0.3642686 0.3632012
##   people     0.1876246 0.2312582 0.2164757 0.3129386 0.3493964 0.2671543
##   power      0.3523297 0.3045336 0.3592473 0.4283691 0.4823880 0.3825696
##   shall      0.3109780 0.3616201 0.3589731 0.3534992 0.4197767 0.3228838
##   states     0.3775741 0.2668935 0.2919410 0.3686965 0.5228231 0.4072595
##   time       0.2443284 0.3741071 0.2302306 0.3895374 0.1702277 0.3522756
##   united     0.3648980 0.2623717 0.2403582 0.3262696 0.4483419 0.3755336
##   will       0.1884575 0.2630200 0.2057079 0.3901750 0.2137291 0.2273583
##   world      0.3037327 0.5509632 0.5317638 0.4174611 0.3732763 0.4578542
##             Terms
## Terms        government     great      just      life       may      must
##   can         0.2579230 0.3042754 0.3563657 0.3891276 0.2454736 0.2257792
##   country     0.2047350 0.2390763 0.3436575 0.5365314 0.2091903 0.3539029
##   every       0.2285687 0.2096882 0.2022730 0.4278774 0.2500473 0.3447757
##   free        0.4167750 0.4302002 0.4530560 0.3598682 0.3684674 0.3947449
##   future      0.4221402 0.3422671 0.2901312 0.3989592 0.4220194 0.2996883
##   good        0.3390539 0.3008674 0.4011886 0.3532778 0.2937487 0.3999765
##   government  0.0000000 0.2538097 0.3284080 0.5131929 0.1897588 0.3346813
##   great       0.2538097 0.0000000 0.2831543 0.4549428 0.2135467 0.3918152
##   just        0.3284080 0.2831543 0.0000000 0.4844806 0.3634034 0.4016780
##   life        0.5131929 0.4549428 0.4844806 0.0000000 0.5740470 0.3630353
##   may         0.1897588 0.2135467 0.3634034 0.5740470 0.0000000 0.3689675
##   must        0.3346813 0.3918152 0.4016780 0.3630353 0.3689675 0.0000000
##   nation      0.4031149 0.3087375 0.3363669 0.2829575 0.4428263 0.3346222
##   nations     0.4886689 0.4644433 0.4336468 0.3544381 0.4583313 0.4073543
##   new         0.5282904 0.4976656 0.4564718 0.3729945 0.6337560 0.3379381
##   now         0.3539362 0.3478060 0.4525142 0.4819565 0.3803697 0.3446933
##   one         0.2253451 0.3185289 0.4083071 0.5308138 0.2428221 0.3131816
##   people      0.1561980 0.2122676 0.3205800 0.4053061 0.1893500 0.2844634
##   power       0.3144943 0.3182054 0.5045919 0.6500729 0.1761881 0.5603424
##   shall       0.2626669 0.3262349 0.3740483 0.4798238 0.2220753 0.4286209
##   states      0.1503715 0.2484336 0.3144041 0.6534657 0.1793855 0.5053126
##   time        0.3544366 0.2798467 0.2648991 0.3886612 0.3363113 0.2596588
##   united      0.2204035 0.1972449 0.3001157 0.5073253 0.2731160 0.4412357
##   will        0.2554974 0.2392831 0.2468342 0.3862783 0.2956313 0.2099962
##   world       0.5681668 0.5193071 0.5327552 0.3016199 0.5944010 0.2184216
##             Terms
## Terms           nation   nations       new       now       one    people
##   can        0.2810751 0.3268656 0.3658506 0.2702416 0.1981736 0.1876246
##   country    0.4035222 0.3906216 0.6269561 0.3961642 0.2578042 0.2312582
##   every      0.3253524 0.4437372 0.4465935 0.3232435 0.2801983 0.2164757
##   free       0.5014245 0.3606915 0.5652935 0.4636810 0.3996113 0.3129386
##   future     0.2368747 0.3747685 0.3712807 0.3613852 0.3642686 0.3493964
##   good       0.2633727 0.4074177 0.4494017 0.3781249 0.3632012 0.2671543
##   government 0.4031149 0.4886689 0.5282904 0.3539362 0.2253451 0.1561980
##   great      0.3087375 0.4644433 0.4976656 0.3478060 0.3185289 0.2122676
##   just       0.3363669 0.4336468 0.4564718 0.4525142 0.4083071 0.3205800
##   life       0.2829575 0.3544381 0.3729945 0.4819565 0.5308138 0.4053061
##   may        0.4428263 0.4583313 0.6337560 0.3803697 0.2428221 0.1893500
##   must       0.3346222 0.4073543 0.3379381 0.3446933 0.3131816 0.2844634
##   nation     0.0000000 0.3911856 0.2389233 0.3628019 0.3765086 0.2904789
##   nations    0.3911856 0.0000000 0.4748058 0.4749950 0.5669782 0.3968177
##   new        0.2389233 0.4748058 0.0000000 0.4120310 0.4726255 0.4367703
##   now        0.3628019 0.4749950 0.4120310 0.0000000 0.3042476 0.2278992
##   one        0.3765086 0.5669782 0.4726255 0.3042476 0.0000000 0.2424710
##   people     0.2904789 0.3968177 0.4367703 0.2278992 0.2424710 0.0000000
##   power      0.4872611 0.6527536 0.6956918 0.5153625 0.2507013 0.2587369
##   shall      0.4901205 0.4562546 0.5808853 0.3135053 0.4146651 0.2428110
##   states     0.5151173 0.5438493 0.6967915 0.3601278 0.3113200 0.2187172
##   time       0.2190216 0.4210667 0.2539761 0.2155948 0.2571156 0.2440165
##   united     0.4477340 0.4010402 0.6261884 0.3173412 0.3682137 0.2665430
##   will       0.2334926 0.3713594 0.3063816 0.1803612 0.2196700 0.1838601
##   world      0.3381502 0.2622823 0.2473251 0.4499794 0.4556484 0.4291283
##             Terms
## Terms            power     shall    states      time    united      will
##   can        0.3523297 0.3109780 0.3775741 0.2443284 0.3648980 0.1884575
##   country    0.3045336 0.3616201 0.2668935 0.3741071 0.2623717 0.2630200
##   every      0.3592473 0.3589731 0.2919410 0.2302306 0.2403582 0.2057079
##   free       0.4283691 0.3534992 0.3686965 0.3895374 0.3262696 0.3901750
##   future     0.4823880 0.4197767 0.5228231 0.1702277 0.4483419 0.2137291
##   good       0.3825696 0.3228838 0.4072595 0.3522756 0.3755336 0.2273583
##   government 0.3144943 0.2626669 0.1503715 0.3544366 0.2204035 0.2554974
##   great      0.3182054 0.3262349 0.2484336 0.2798467 0.1972449 0.2392831
##   just       0.5045919 0.3740483 0.3144041 0.2648991 0.3001157 0.2468342
##   life       0.6500729 0.4798238 0.6534657 0.3886612 0.5073253 0.3862783
##   may        0.1761881 0.2220753 0.1793855 0.3363113 0.2731160 0.2956313
##   must       0.5603424 0.4286209 0.5053126 0.2596588 0.4412357 0.2099962
##   nation     0.4872611 0.4901205 0.5151173 0.2190216 0.4477340 0.2334926
##   nations    0.6527536 0.4562546 0.5438493 0.4210667 0.4010402 0.3713594
##   new        0.6956918 0.5808853 0.6967915 0.2539761 0.6261884 0.3063816
##   now        0.5153625 0.3135053 0.3601278 0.2155948 0.3173412 0.1803612
##   one        0.2507013 0.4146651 0.3113200 0.2571156 0.3682137 0.2196700
##   people     0.2587369 0.2428110 0.2187172 0.2440165 0.2665430 0.1838601
##   power      0.0000000 0.4360201 0.3234218 0.4166865 0.3998290 0.4231578
##   shall      0.4360201 0.0000000 0.2449347 0.3897523 0.3418936 0.2887096
##   states     0.3234218 0.2449347 0.0000000 0.4248667 0.1442522 0.3265694
##   time       0.4166865 0.3897523 0.4248667 0.0000000 0.3697408 0.1750546
##   united     0.3998290 0.3418936 0.1442522 0.3697408 0.0000000 0.3062891
##   will       0.4231578 0.2887096 0.3265694 0.1750546 0.3062891 0.0000000
##   world      0.7102814 0.5616665 0.7295113 0.3166116 0.5751415 0.3533194
##             Terms
## Terms            world
##   can        0.3037327
##   country    0.5509632
##   every      0.5317638
##   free       0.4174611
##   future     0.3732763
##   good       0.4578542
##   government 0.5681668
##   great      0.5193071
##   just       0.5327552
##   life       0.3016199
##   may        0.5944010
##   must       0.2184216
##   nation     0.3381502
##   nations    0.2622823
##   new        0.2473251
##   now        0.4499794
##   one        0.4556484
##   people     0.4291283
##   power      0.7102814
##   shall      0.5616665
##   states     0.7295113
##   time       0.3166116
##   united     0.5751415
##   will       0.3533194
##   world      0.0000000

Check whether there are any NA or infinite values in the data frame:

# TRUE if any column of compare_df holds an NA/NaN or an infinite value.
# Columns are checked one at a time on purpose: apply() would first coerce the
# whole data frame to a character matrix (columns a and b are text), and
# is.infinite() is always FALSE for character data, so an infinite score
# would go unnoticed.
compare_df |>
  vapply(
    function(column) any(is.na(column) | is.infinite(column)),
    logical(1)
  ) |>
  any()
## [1] FALSE

7.2 Clustering Docs

clustering methods

First we pivot the score dataframe to construct a distance matrix

# Spread the pairwise scores into wide form: one row per `b` document and one
# column per `a` document, with the score in the cells.
# NOTE(review): the scores are Jaccard similarities rather than distances, and
# each pair presumably appears only once, so the table is likely neither
# square nor symmetric - confirm before using it as a distance matrix.
distance_mat <- pivot_wider(compare_df, names_from = a, values_from = score)

7.2.1 K-modes clustering

# k-modes clustering of the pair table into 2 clusters. Given only a number of
# modes, kmodes() picks random rows as the initial modes, so the seed is fixed
# to keep the clusters and the plot reproducible between runs.
# NOTE(review): each row of compare_df is a document pair, so this groups
# pairs rather than individual documents - confirm that is intended.
set.seed(142)
cl <- kmodes(compare_df, 2)
# Pairs plot of the table's columns, coloured by assigned cluster.
plot(compare_df, col = cl$cluster)

# Najada: clusters for the results of cosine similarity.
# NOTE(review): cosine_dist_mat already holds distances, so dist() here is a
# Euclidean distance between its columns (a distance of distances) - confirm
# this is intended rather than as.dist(cosine_dist_mat).
d <- dist(t(cosine_dist_mat), method = "euclidian")
# kmeans() starts from randomly chosen centres; fix the seed so the cluster
# assignment and the plot below are reproducible between runs.
set.seed(142)
kfit <- kmeans(d, 4)
# Cluster plot: labels = 2 labels all points and ellipses, lines = 0 draws no
# distance lines between the ellipses.
clusplot(
  as.matrix(d),
  kfit$cluster,
  color = TRUE,
  shade = TRUE,
  labels = 2,
  lines = 0
)

# What are we trying to do here?
# Print the summary of the first term column of the document-term matrix.
print(dtms[, 1])
## <<DocumentTermMatrix (documents: 59, terms: 1)>>
## Non-/sparse entries: 56/3
## Sparsity           : 5%
## Maximal term length: 3
## Weighting          : term frequency (tf)
# Document-by-document cosine similarity over all terms of `dtms`.
# The earlier version combined single term columns (dtms[,1] and dtms[,2])
# and its numerator and denominator paired the documents the opposite way
# round, so the ratio was not a cosine similarity and divided by zero for
# documents that lack the term.
# Squared Euclidean norm of every document row.
doc_sq_norms <- row_sums(dtms^2)
# tcrossprod gives the dot product of every pair of document rows; dividing by
# the product of the two row norms yields the cosine of the angle between
# them.
cosine_sim <- tcrossprod_simple_triplet_matrix(dtms) /
  sqrt(doc_sq_norms %*% t(doc_sq_norms))

#cosine_sim

8 Conclusion

//TODO: writing